I. Preparing data
# Load the Fullerton listings and keep only the columns used for modelling.
# A missing LOT_SIZE is filled in with the listing's SQUARE_FEET, and ZIP is
# converted to a factor.
Fullerton <- read.csv("FullertonHousing.csv") %>%
  mutate(
    LOT_SIZE = ifelse(is.na(LOT_SIZE), SQUARE_FEET, LOT_SIZE),
    ZIP = as.factor(ZIP)
  ) %>%
  select(
    PROPERTY_TYPE, ZIP, PRICE, DAYS_ON_MARKET,
    YEAR_BUILT, LOT_SIZE, SQUARE_FEET
  )
# Check the sample size for every property type / zip code combination.
# The two columns are selected by name so the table does not depend on the
# column order of Fullerton.
table(Fullerton[, c("PROPERTY_TYPE", "ZIP")]) %>%
  kable(caption = "Sample sizes per zip code per property type") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Table 1: Sample sizes per zip code per property type
| | 92831 | 92832 | 92833 | 92835 |
|---|---|---|---|---|
| Condo/Co-op | 10 | 13 | 3 | 10 |
| Single Family Residential | 48 | 26 | 16 | 56 |
| Townhouse | 7 | 2 | 0 | 4 |
II. Clustering per zip code
ZIP 92835
EDA
# Restrict the data to one zip code; droplevels() removes factor levels
# (e.g. property types) that do not occur in this zip code.
zip_code <- 92835
subset <- Fullerton %>%
  filter(ZIP == zip_code) %>%
  select(-ZIP) %>%
  droplevels()

# Numeric feature matrix: every column except PROPERTY_TYPE (the first one).
X <- subset[, -1] %>% as.matrix()
# seq_len() gives an empty index for an empty subset, whereas 1:nrow(X) would
# give c(1, 0) and make the rownames assignment fail.
rownames(X) <- paste0("id_", seq_len(nrow(X)))

# Histogram of every feature, filled by property type.
subset %>%
  reshape2::melt(id.vars = "PROPERTY_TYPE") %>%
  ggplot(aes(x = value, fill = PROPERTY_TYPE)) +
  # bins = 30 is the ggplot2 default; stating it avoids the stat_bin() message.
  geom_histogram(bins = 30) +
  scale_fill_jco() +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(x = "")

# Pairwise scatter plots of the features, marked by property type.
clPairs(X, subset[, 1])

Hierarchical clustering
# Standardise the features so that no single variable dominates the distances.
X_scaled <- scale(X)
# Pairwise Euclidean distances between the standardised listings.
d <- dist(X_scaled, method = "euclidean")
# Agglomerative clustering with complete linkage. hclust() also offers
# "average", "single", "ward.D" and "ward.D2", among others.
hc <- hclust(d = d, method = "complete")
Optimal clusters
# Elbow method for the optimal number of clusters.
# Run on the standardised data with complete linkage so the diagnostic
# evaluates the same kind of clustering that hclust() builds from X_scaled
# (hcut() defaults to "ward.D2", and on the raw X the large-valued PRICE
# column would dominate the distances). Extra arguments are forwarded to hcut().
fviz_nbclust(
  X_scaled,
  FUNcluster = hcut,
  method = "wss",
  hc_method = "complete"
)

# # Other methods
# # a. average silhouette method
# fviz_nbclust(X_scaled, FUNcluster = hcut, method = "silhouette",
#              hc_method = "complete")
# # b. gap statistic method
# # NOTE(review): nstart is a kmeans() argument and was dropped here because
# # hcut() does not appear to accept it -- confirm before enabling.
# gap_stat <- cluster::clusGap(X_scaled, FUNcluster = hcut, K.max = 10, B = 50,
#                              hc_method = "complete")
# fviz_gap_stat(gap_stat)
Cut-off tree
# Number of clusters to extract from the tree.
cut_off <- 4
# Cluster membership of every listing at that cut.
myCluster <- cutree(hc, k = cut_off)
# Horizontal dendrogram with the branches of the cut_off clusters drawn in the
# "jco" palette; labels are not coloured by cluster.
fviz_dend(
  hc,
  k = cut_off,
  k_colors = "jco",
  color_labels_by_k = FALSE,
  horiz = TRUE,
  cex = 0.5,
  main = "",
  ggtheme = theme_minimal()
)

Model-based clustering
Find optimal components
# BIC for the candidate Gaussian mixture models, computed on the raw features
# (scaling is not necessary here).
BIC <- mclustBIC(data = X)
plot(BIC)

# Best models ranked by BIC.
summary(BIC)
## Best BIC values:
## VEE,3 VEE,5 EEE,5
## BIC -5617.057 -5618.739033 -5623.865588
## BIC diff 0.000 -1.682532 -6.809087
# There are other criteria, such as ICL (integrated classification likelihood)
# ICL <- mclustICL(X)
# plot(ICL)
# summary(ICL)
Fit with optimal components
# Fit the mixture model, reusing the BIC results computed above instead of
# repeating the search, then print the fit together with its parameters.
model <- Mclust(data = X, x = BIC)
summary(model, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 3 components:
##
## log-likelihood n df BIC ICL
## -2736.304 70 34 -5617.057 -5622.855
##
## Clustering table:
## 1 2 3
## 21 42 7
##
## Mixing probabilities:
## 1 2 3
## 0.28231665 0.61814819 0.09953516
##
## Means:
## [,1] [,2] [,3]
## PRICE 667887.99878 968095.62657 3.900169e+05
## DAYS_ON_MARKET 51.53225 43.44039 6.448473e+00
## YEAR_BUILT 1965.52891 1973.82742 1.976135e+03
## LOT_SIZE 6927.12401 11176.95928 8.381570e+02
## SQUARE_FEET 1550.83838 2654.35600 1.022338e+03
##
## Variances:
## [,,1]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 19657630003.0 193609.292560 -1.778730e+05 216367710.345
## DAYS_ON_MARKET 193609.3 162.270992 -1.612462e+00 2438.303
## YEAR_BUILT -177873.0 -1.612462 5.845489e+01 -13685.932
## LOT_SIZE 216367710.3 2438.302617 -1.368593e+04 8718113.292
## SQUARE_FEET 52212490.8 894.193568 -1.751302e+02 303765.679
## SQUARE_FEET
## PRICE 52212490.8166
## DAYS_ON_MARKET 894.1936
## YEAR_BUILT -175.1302
## LOT_SIZE 303765.6786
## SQUARE_FEET 181761.2243
## [,,2]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 1.026058e+11 1.010571e+06 -9.284334e+05 1129361761.15
## DAYS_ON_MARKET 1.010571e+06 8.469963e+02 -8.416474e+00 12727.06
## YEAR_BUILT -9.284334e+05 -8.416474e+00 3.051135e+02 -71435.65
## LOT_SIZE 1.129362e+09 1.272706e+04 -7.143565e+04 45505421.15
## SQUARE_FEET 2.725305e+08 4.667369e+03 -9.141168e+02 1585547.78
## SQUARE_FEET
## PRICE 2.725305e+08
## DAYS_ON_MARKET 4.667369e+03
## YEAR_BUILT -9.141168e+02
## LOT_SIZE 1.585548e+06
## SQUARE_FEET 9.487283e+05
## [,,3]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 2753589304.48 27120.2824124 -2.491599e+04 30308221.945
## DAYS_ON_MARKET 27120.28 22.7304954 -2.258695e-01 341.551
## YEAR_BUILT -24915.99 -0.2258695 8.188207e+00 -1917.089
## LOT_SIZE 30308221.94 341.5510418 -1.917089e+03 1221210.467
## SQUARE_FEET 7313788.91 125.2562921 -2.453178e+01 42550.700
## SQUARE_FEET
## PRICE 7313788.90795
## DAYS_ON_MARKET 125.25629
## YEAR_BUILT -24.53178
## LOT_SIZE 42550.70035
## SQUARE_FEET 25460.63605
Plot the results
# Classification plot of the fitted mixture model.
plot(x = model, what = "classification")

ZIP 92831
EDA
# Restrict the data to one zip code; droplevels() removes factor levels
# (e.g. property types) that do not occur in this zip code.
zip_code <- 92831
subset <- Fullerton %>%
  filter(ZIP == zip_code) %>%
  select(-ZIP) %>%
  droplevels()

# Numeric feature matrix: every column except PROPERTY_TYPE (the first one).
X <- subset[, -1] %>% as.matrix()
# seq_len() gives an empty index for an empty subset, whereas 1:nrow(X) would
# give c(1, 0) and make the rownames assignment fail.
rownames(X) <- paste0("id_", seq_len(nrow(X)))

# Histogram of every feature, filled by property type.
subset %>%
  reshape2::melt(id.vars = "PROPERTY_TYPE") %>%
  ggplot(aes(x = value, fill = PROPERTY_TYPE)) +
  # bins = 30 is the ggplot2 default; stating it avoids the stat_bin() message.
  geom_histogram(bins = 30) +
  scale_fill_jco() +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(x = "")

# Pairwise scatter plots of the features, marked by property type.
clPairs(X, subset[, 1])

Hierarchical clustering
# Standardise the features so that no single variable dominates the distances.
X_scaled <- scale(X)
# Pairwise Euclidean distances between the standardised listings.
d <- dist(X_scaled, method = "euclidean")
# Agglomerative clustering with complete linkage. hclust() also offers
# "average", "single", "ward.D" and "ward.D2", among others.
hc <- hclust(d = d, method = "complete")
Optimal clusters
# Elbow method for the optimal number of clusters.
# Run on the standardised data with complete linkage so the diagnostic
# evaluates the same kind of clustering that hclust() builds from X_scaled
# (hcut() defaults to "ward.D2", and on the raw X the large-valued PRICE
# column would dominate the distances). Extra arguments are forwarded to hcut().
fviz_nbclust(
  X_scaled,
  FUNcluster = hcut,
  method = "wss",
  hc_method = "complete"
)

Cut-off tree
# Number of clusters to extract from the tree.
cut_off <- 4
# Cluster membership of every listing at that cut.
myCluster <- cutree(hc, k = cut_off)
# Horizontal dendrogram with the branches of the cut_off clusters drawn in the
# "jco" palette; labels are not coloured by cluster.
fviz_dend(
  hc,
  k = cut_off,
  k_colors = "jco",
  color_labels_by_k = FALSE,
  horiz = TRUE,
  cex = 0.5,
  main = "",
  ggtheme = theme_minimal()
)

Model-based clustering
Find optimal components
# BIC for the candidate Gaussian mixture models, computed on the raw features
# (scaling is not necessary here).
BIC <- mclustBIC(data = X)
plot(BIC)

# Best models ranked by BIC.
summary(BIC)
## Best BIC values:
## VEE,6 VEE,4 VEE,5
## BIC -5146.957 -5153.126429 -5158.46020
## BIC diff 0.000 -6.169585 -11.50335
Fit with optimal components
# Fit the mixture model, reusing the BIC results computed above instead of
# repeating the search, then print the fit together with its parameters.
model <- Mclust(data = X, x = BIC)
summary(model, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 6 components:
##
## log-likelihood n df BIC ICL
## -2458.683 65 55 -5146.957 -5149.174
##
## Clustering table:
## 1 2 3 4 5 6
## 9 17 20 6 4 9
##
## Mixing probabilities:
## 1 2 3 4 5 6
## 0.13841098 0.24952692 0.32122383 0.09080995 0.06157002 0.13845830
##
## Means:
## [,1] [,2] [,3] [,4] [,5]
## PRICE 481361.71796 653596.44670 848196.20168 662465.01552 1.375982e+06
## DAYS_ON_MARKET 68.19777 61.27828 44.24036 16.16969 3.251576e+01
## YEAR_BUILT 1973.77774 1957.48136 1955.65164 1955.87847 1.969746e+03
## LOT_SIZE 1250.11315 8091.01965 11785.36048 7467.40520 4.299286e+04
## SQUARE_FEET 1415.78910 1440.56997 2214.82928 1432.38862 3.000589e+03
## [,6]
## PRICE 409229.50300
## DAYS_ON_MARKET 25.36325
## YEAR_BUILT 1972.66828
## LOT_SIZE 949.99183
## SQUARE_FEET 1099.30977
##
## Variances:
## [,,1]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 7531919254.5 -654708.29979 334073.69674 103477482.100
## DAYS_ON_MARKET -654708.3 205.38325 -41.95925 -4967.801
## YEAR_BUILT 334073.7 -41.95925 35.02141 4018.682
## LOT_SIZE 103477482.1 -4967.80132 4018.68190 3477566.051
## SQUARE_FEET 21947429.7 -1900.59588 984.05288 373156.950
## SQUARE_FEET
## PRICE 21947429.7422
## DAYS_ON_MARKET -1900.5959
## YEAR_BUILT 984.0529
## LOT_SIZE 373156.9499
## SQUARE_FEET 89588.7694
## [,,2]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 5975207116.2 -519391.87871 265026.67682 82090549.104
## DAYS_ON_MARKET -519391.9 162.93423 -33.28703 -3941.046
## YEAR_BUILT 265026.7 -33.28703 27.78312 3188.093
## LOT_SIZE 82090549.1 -3941.04620 3188.09269 2758815.743
## SQUARE_FEET 17411291.1 -1507.77692 780.66686 296032.125
## SQUARE_FEET
## PRICE 17411291.0596
## DAYS_ON_MARKET -1507.7769
## YEAR_BUILT 780.6669
## LOT_SIZE 296032.1250
## SQUARE_FEET 71072.3834
## [,,3]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 54071055490 -4700099.3520 2398288.7740 742856699.28
## DAYS_ON_MARKET -4700099 1474.4302 -301.2221 -35663.45
## YEAR_BUILT 2398289 -301.2221 251.4160 28849.80
## LOT_SIZE 742856699 -35663.4546 28849.8011 24965172.96
## SQUARE_FEET 157558871 -13644.2282 7064.4381 2678864.37
## SQUARE_FEET
## PRICE 157558870.636
## DAYS_ON_MARKET -13644.228
## YEAR_BUILT 7064.438
## LOT_SIZE 2678864.372
## SQUARE_FEET 643150.724
## [,,4]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 3220269499.0 -279920.30946 142833.09469 44241762.051
## DAYS_ON_MARKET -279920.3 87.81154 -17.93966 -2123.982
## YEAR_BUILT 142833.1 -17.93966 14.97339 1718.186
## LOT_SIZE 44241762.1 -2123.98175 1718.18607 1486832.175
## SQUARE_FEET 9383616.1 -812.59912 420.73147 159543.126
## SQUARE_FEET
## PRICE 9383616.0736
## DAYS_ON_MARKET -812.5991
## YEAR_BUILT 420.7315
## LOT_SIZE 159543.1262
## SQUARE_FEET 38303.6477
## [,,5]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 101520106921 -8824584.326 4502862.5862 1394736811.67
## DAYS_ON_MARKET -8824584 2768.289 -565.5540 -66959.26
## YEAR_BUILT 4502863 -565.554 472.0414 54166.41
## LOT_SIZE 1394736812 -66959.257 54166.4087 46872897.26
## SQUARE_FEET 295821734 -25617.468 13263.7048 5029652.09
## SQUARE_FEET
## PRICE 295821733.98
## DAYS_ON_MARKET -25617.47
## YEAR_BUILT 13263.70
## LOT_SIZE 5029652.09
## SQUARE_FEET 1207535.71
## [,,6]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 5415200499.2 -470713.58468 240187.92399 74396882.626
## DAYS_ON_MARKET -470713.6 147.66376 -30.16731 -3571.685
## YEAR_BUILT 240187.9 -30.16731 25.17924 2889.299
## LOT_SIZE 74396882.6 -3571.68462 2889.29919 2500254.820
## SQUARE_FEET 15779475.1 -1366.46549 707.50143 268287.489
## SQUARE_FEET
## PRICE 15779475.1219
## DAYS_ON_MARKET -1366.4655
## YEAR_BUILT 707.5014
## LOT_SIZE 268287.4886
## SQUARE_FEET 64411.3582
Plot the results
# Classification plot of the fitted mixture model.
plot(x = model, what = "classification")

III. Clustering per house type
Single Family Residential
EDA
# Restrict the data to one property type; with PROPERTY_TYPE removed, ZIP is
# the first remaining column.
property_type <- "Single Family Residential"
subset <- Fullerton %>%
  filter(PROPERTY_TYPE == property_type) %>%
  select(-PROPERTY_TYPE) %>%
  droplevels()

# Numeric feature matrix: every column except ZIP (the first one).
X <- subset[, -1] %>% as.matrix()
# seq_len() gives an empty index for an empty subset, whereas 1:nrow(X) would
# give c(1, 0) and make the rownames assignment fail.
rownames(X) <- paste0("id_", seq_len(nrow(X)))
# Pairwise scatter plots of the features, marked by zip code.
clPairs(X, subset[, 1])

Hierarchical clustering
# Standardise the features so that no single variable dominates the distances.
X_scaled <- scale(X)
# Pairwise Euclidean distances between the standardised listings.
d <- dist(X_scaled, method = "euclidean")
# Agglomerative clustering with complete linkage. hclust() also offers
# "average", "single", "ward.D" and "ward.D2", among others.
hc <- hclust(d = d, method = "complete")
Optimal clusters
# Elbow method for the optimal number of clusters.
# Run on the standardised data with complete linkage so the diagnostic
# evaluates the same kind of clustering that hclust() builds from X_scaled
# (hcut() defaults to "ward.D2", and on the raw X the large-valued PRICE
# column would dominate the distances). Extra arguments are forwarded to hcut().
fviz_nbclust(
  X_scaled,
  FUNcluster = hcut,
  method = "wss",
  hc_method = "complete"
)

Cut-off tree
# Number of clusters to extract from the tree.
cut_off <- 4
# Cluster membership of every listing at that cut.
myCluster <- cutree(hc, k = cut_off)
# Horizontal dendrogram with the branches of the cut_off clusters drawn in the
# "jco" palette; labels are not coloured by cluster.
fviz_dend(
  hc,
  k = cut_off,
  k_colors = "jco",
  color_labels_by_k = FALSE,
  horiz = TRUE,
  cex = 0.5,
  main = "",
  ggtheme = theme_minimal()
)

Model-based clustering
Find optimal components
# BIC for the candidate Gaussian mixture models, computed on the raw features
# (scaling is not necessary here).
BIC <- mclustBIC(data = X)
plot(BIC)

# Best models ranked by BIC.
summary(BIC)
## Best BIC values:
## VVE,3 VVE,4 VVE,2
## BIC -11637.99 -11655.56603 -11673.11340
## BIC diff 0.00 -17.57758 -35.12495
Fit with optimal components
# Fit the mixture model, reusing the BIC results computed above instead of
# repeating the search, then print the fit together with its parameters.
model <- Mclust(data = X, x = BIC)
summary(model, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 3 components:
##
## log-likelihood n df BIC ICL
## -5714.338 146 42 -11637.99 -11666.2
##
## Clustering table:
## 1 2 3
## 40 55 51
##
## Mixing probabilities:
## 1 2 3
## 0.2591771 0.3791035 0.3617193
##
## Means:
## [,1] [,2] [,3]
## PRICE 622403.45582 701854.96233 1.076618e+06
## DAYS_ON_MARKET 50.38175 43.71939 4.458788e+01
## YEAR_BUILT 1956.09933 1956.85446 1.969576e+03
## LOT_SIZE 7826.73598 7193.81115 1.584944e+04
## SQUARE_FEET 1418.77763 1754.17407 2.858298e+03
##
## Variances:
## [,,1]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 6203302271.97 -1.646795e+05 96871.649785 104558399.623
## DAYS_ON_MARKET -164679.53 6.328525e+02 -6.158904 -2830.019
## YEAR_BUILT 96871.65 -6.158904e+00 8.355735 1175.552
## LOT_SIZE 104558399.62 -2.830019e+03 1175.551913 2320537.424
## SQUARE_FEET 13984854.88 -1.961888e+02 281.758561 229008.286
## SQUARE_FEET
## PRICE 13984854.8790
## DAYS_ON_MARKET -196.1888
## YEAR_BUILT 281.7586
## LOT_SIZE 229008.2855
## SQUARE_FEET 65471.8151
## [,,2]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 15023922025.9 -3.988369e+05 234653.972562 2.531856e+08
## DAYS_ON_MARKET -398836.9 5.461298e+02 -6.057833 -7.132905e+03
## YEAR_BUILT 234654.0 -6.057833e+00 355.189283 5.650821e+02
## LOT_SIZE 253185613.5 -7.132905e+03 565.082097 8.400783e+06
## SQUARE_FEET 33870770.7 -3.099189e+02 761.746460 5.193118e+05
## SQUARE_FEET
## PRICE 33870770.7400
## DAYS_ON_MARKET -309.9189
## YEAR_BUILT 761.7465
## LOT_SIZE 519311.7623
## SQUARE_FEET 188844.1271
## [,,3]
## PRICE DAYS_ON_MARKET YEAR_BUILT LOT_SIZE
## PRICE 56883073078.9 -1.509942e+06 889398.89910 957434006.62
## DAYS_ON_MARKET -1509942.3 6.887649e+02 -15.82653 -34030.10
## YEAR_BUILT 889398.9 -1.582653e+01 458.51636 -54879.96
## LOT_SIZE 957434006.6 -3.403010e+04 -54879.96388 101251369.00
## SQUARE_FEET 128255752.2 -1.935916e+03 3339.61103 1072249.31
## SQUARE_FEET
## PRICE 128255752.192
## DAYS_ON_MARKET -1935.916
## YEAR_BUILT 3339.611
## LOT_SIZE 1072249.312
## SQUARE_FEET 562594.734
Plot the results
# Classification plot of the fitted mixture model.
plot(x = model, what = "classification")
